import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
#
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics import silhouette_score
from tqdm import tqdm
from itertools import combinations
import optuna
from optuna.visualization import plot_optimization_history
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import optuna
tqdm.pandas()
import plotly.io as pio
pio.renderers.default='notebook'
from optuna.visualization import *
pd.set_option('display.max_columns', 500)
# Columns that identify the athlete or duplicate location/time info; not needed downstream.
drop_cols = ['hashedid','geom','geog','startdatelocal','startlatapprox','startlngapprox','activity_id','row_number']
df = pd.read_pickle('./data/df_2017-5k.pkl')#,index_col=0)
df = df.drop(drop_cols,axis=1)
# removing athletes that don't have heart rate values
df = df[df.heartrate_100.str.len()>0].reset_index(drop=True)
df.head(1)
# Keep plausible marathon activities only (total distance below 45 km).
df = df[df.totaldistance<45e3].copy()
df['group'] = None
# Anyone short of the ~42 km marathon distance is flagged as a non-finisher.
df.loc[df.totaldistance<42000,'group'] = 'DNF'
# The *_100 columns appear to hold per-100m sample sequences; truncate each to the
# first 420 samples (42 km) so all activities align. TODO confirm sampling unit.
cols = [x for x in df.columns if '100' in x]
rows = 420
df[cols] = df[cols].apply(lambda x: x.map(lambda y: y[:rows]))
# calculating finish time (seconds) as the last cumulative time sample
df['finish_time'] = df['time_100'].map(lambda x: x[-1])
# BUG FIX: marathon finish times exceed one hour, so '%M:%S' silently dropped the
# hours component of every display string; '%H:%M:%S' shows the full duration.
df['finish_time_minutes'] = pd.to_datetime(df['finish_time'], unit='s').dt.strftime('%H:%M:%S')
# Mean of the per-sample heart-rate sequence for each activity.
df['avg_heart_rate'] = df['heartrate_100'].map(lambda x: np.array(x).mean())
In our implementation, we utilize the K-means algorithm to cluster athletes based on their performance metrics such as elevation, heart rate, distance, and total time. Scaling is a crucial factor in clustering, and we employ the Min-Max Scaler to address this requirement.
To optimize our algorithm, we tuned several hyperparameters using Optuna.
The `init` parameter determines how the centroids are initialized. Our implementation provides two options: "k-means++" and "random". "k-means++" is a smarter initialization method that selects initial centroids far from each other, resulting in better clustering outcomes. On the other hand, "random" initialization randomly chooses k data points as initial centroids.
The `max_iter` parameter defines the maximum number of iterations that the K-means algorithm can run. The algorithm terminates when either the maximum iterations are reached or the convergence criterion is satisfied.
The tolerance parameter (`tol`) controls the convergence criterion of the K-means algorithm. The algorithm stops when the change in the within-cluster sum of squares (WSS) between iterations is less than the specified tolerance value. In our implementation, we optimize this parameter using Optuna, and we set the log=True option to explore a wide range of values on a logarithmic scale.
In this section, we present the results of training a K-means model using the Optuna hyperparameter optimization library.
For this project, we divided athletes into three distinct groups: fast, medium, and slow, using K-means clustering.
def objective(trial):
    """Optuna objective: maximize the silhouette score of a 3-cluster KMeans fit.

    Samples the centroid-initialization scheme, iteration cap, and convergence
    tolerance for this trial. Relies on the module-level feature matrix ``X``.
    """
    # Draw this trial's hyperparameters from the search space.
    init_method = trial.suggest_categorical("init_method", ["k-means++", "random"])
    max_iter = trial.suggest_int("max_iter", 100, 1000)
    tol = trial.suggest_float("tol", 1e-5, 1e-1, log=True)
    # Fit the model, then label every sample with its nearest centroid.
    model = KMeans(n_clusters=3, init=init_method, max_iter=max_iter, tol=tol, n_init=10)
    model.fit(X)
    cluster_labels = model.predict(X)
    # Higher silhouette means tighter, better-separated clusters.
    return silhouette_score(X, cluster_labels)
# Cluster only the finishers (group still NaN; DNFs excluded) on distance and time.
cluster_cols = ['totaldistance','finish_time']
# BUG FIX: the original mutated the result of chained indexing df[mask][cols],
# which triggers pandas' SettingWithCopyWarning and may silently fail to write;
# take an explicit copy before scaling in place.
df_cluster = df[df['group'].isna()][cluster_cols].copy()
# Min-max scaling puts both features on [0, 1] so neither dominates the distance metric.
scaler = MinMaxScaler()
df_cluster[cluster_cols] = scaler.fit_transform(df_cluster)
X = df_cluster
# Maximize the silhouette score returned by objective().
study = optuna.create_study(direction="maximize")
# Optimize the hyperparameters
study.optimize(objective, n_trials=100)
# Print the best hyperparameters and the optimization metric
print("Best hyperparameters: ", study.best_params)
print("Best score: ", study.best_value)
[I 2023-05-21 15:41:45,434] A new study created in memory with name: no-name-41e1fc78-48e9-47db-93d2-2b844e409770 C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:41:46,902] Trial 0 finished with value: 0.43133664591161675 and parameters: {'init_method': 'random', 'max_iter': 587, 'tol': 1.5734372854280852e-05}. Best is trial 0 with value: 0.43133664591161675. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:41:48,326] Trial 1 finished with value: 0.43052860620939215 and parameters: {'init_method': 'random', 'max_iter': 938, 'tol': 0.0018953382982893248}. Best is trial 0 with value: 0.43133664591161675. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:41:49,713] Trial 2 finished with value: 0.3409565704347823 and parameters: {'init_method': 'k-means++', 'max_iter': 409, 'tol': 0.027254784078630138}. Best is trial 0 with value: 0.43133664591161675. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. 
[I 2023-05-21 15:41:51,108] Trial 3 finished with value: 0.4321049001896423 and parameters: {'init_method': 'random', 'max_iter': 260, 'tol': 0.0007176690096820972}. Best is trial 3 with value: 0.4321049001896423. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:41:52,512] Trial 4 finished with value: 0.43196716398960044 and parameters: {'init_method': 'random', 'max_iter': 136, 'tol': 0.017184690956834534}. Best is trial 3 with value: 0.4321049001896423. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:41:53,928] Trial 5 finished with value: 0.4358925383652514 and parameters: {'init_method': 'random', 'max_iter': 618, 'tol': 0.01043885145036651}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:41:55,391] Trial 6 finished with value: 0.432588849686833 and parameters: {'init_method': 'k-means++', 'max_iter': 283, 'tol': 0.00019299368633496613}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. 
You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:41:56,754] Trial 7 finished with value: 0.43110530254332574 and parameters: {'init_method': 'k-means++', 'max_iter': 408, 'tol': 0.00342528274492568}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:41:58,194] Trial 8 finished with value: 0.43194141166773575 and parameters: {'init_method': 'k-means++', 'max_iter': 296, 'tol': 0.005182855174800681}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:41:59,597] Trial 9 finished with value: 0.43072754735175267 and parameters: {'init_method': 'k-means++', 'max_iter': 141, 'tol': 0.019274847304818335}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:01,012] Trial 10 finished with value: 0.3355140748379241 and parameters: {'init_method': 'random', 'max_iter': 707, 'tol': 0.057167557691404323}. Best is trial 5 with value: 0.4358925383652514. 
C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:02,491] Trial 11 finished with value: 0.4321049001896423 and parameters: {'init_method': 'k-means++', 'max_iter': 773, 'tol': 0.00031100774278164926}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:03,919] Trial 12 finished with value: 0.4321049001896423 and parameters: {'init_method': 'random', 'max_iter': 547, 'tol': 0.00021219444300452466}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:05,395] Trial 13 finished with value: 0.43431055592471046 and parameters: {'init_method': 'k-means++', 'max_iter': 548, 'tol': 0.09021479274642404}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:06,846] Trial 14 finished with value: 0.3345657387247144 and parameters: {'init_method': 'random', 'max_iter': 625, 'tol': 0.08318354956728917}. 
Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:08,300] Trial 15 finished with value: 0.42690664645117293 and parameters: {'init_method': 'k-means++', 'max_iter': 830, 'tol': 0.008348355650925707}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:09,871] Trial 16 finished with value: 0.42695027147860704 and parameters: {'init_method': 'random', 'max_iter': 473, 'tol': 0.08349293098806082}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:11,387] Trial 17 finished with value: 0.3351085843553992 and parameters: {'init_method': 'k-means++', 'max_iter': 682, 'tol': 0.03194424062119821}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. 
[I 2023-05-21 15:42:12,935] Trial 18 finished with value: 0.33316556203552206 and parameters: {'init_method': 'k-means++', 'max_iter': 501, 'tol': 0.0110472460688602}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:14,391] Trial 19 finished with value: 0.34286547467160644 and parameters: {'init_method': 'random', 'max_iter': 948, 'tol': 0.03730417928263107}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:15,900] Trial 20 finished with value: 0.4308824906452762 and parameters: {'init_method': 'random', 'max_iter': 806, 'tol': 0.009730534219672736}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:17,389] Trial 21 finished with value: 0.43116315841951824 and parameters: {'init_method': 'k-means++', 'max_iter': 327, 'tol': 0.002337737965660367}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. 
You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:18,849] Trial 22 finished with value: 0.3494779965358338 and parameters: {'init_method': 'k-means++', 'max_iter': 211, 'tol': 0.04693144321237858}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:20,351] Trial 23 finished with value: 0.3400839902614658 and parameters: {'init_method': 'k-means++', 'max_iter': 386, 'tol': 0.09412350964573173}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:21,829] Trial 24 finished with value: 0.43160354706982407 and parameters: {'init_method': 'k-means++', 'max_iter': 643, 'tol': 0.005676184949306793}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:23,318] Trial 25 finished with value: 0.3428382802701592 and parameters: {'init_method': 'k-means++', 'max_iter': 506, 'tol': 0.016205825131276733}. Best is trial 5 with value: 0.4358925383652514. 
C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:24,782] Trial 26 finished with value: 0.33978207236234537 and parameters: {'init_method': 'k-means++', 'max_iter': 737, 'tol': 0.035803960060191244}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:26,290] Trial 27 finished with value: 0.43239011909298825 and parameters: {'init_method': 'k-means++', 'max_iter': 874, 'tol': 0.0010605150385244995}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:27,779] Trial 28 finished with value: 0.4274654173221567 and parameters: {'init_method': 'random', 'max_iter': 560, 'tol': 0.020640448929865485}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:29,310] Trial 29 finished with value: 0.4321049001896423 and parameters: {'init_method': 'random', 'max_iter': 611, 'tol': 5.544477997913555e-05}. 
Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:30,809] Trial 30 finished with value: 0.4343877645144766 and parameters: {'init_method': 'k-means++', 'max_iter': 216, 'tol': 1.53133887109628e-05}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:32,338] Trial 31 finished with value: 0.4321049001896423 and parameters: {'init_method': 'k-means++', 'max_iter': 182, 'tol': 2.6270650115003265e-05}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:33,856] Trial 32 finished with value: 0.4321049001896423 and parameters: {'init_method': 'k-means++', 'max_iter': 238, 'tol': 2.7040450836720958e-05}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. 
[I 2023-05-21 15:42:35,383] Trial 33 finished with value: 0.4321049001896423 and parameters: {'init_method': 'k-means++', 'max_iter': 335, 'tol': 1.0525560105936893e-05}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:36,923] Trial 34 finished with value: 0.4321049001896423 and parameters: {'init_method': 'k-means++', 'max_iter': 105, 'tol': 9.446066894239697e-05}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:38,439] Trial 35 finished with value: 0.4321049001896423 and parameters: {'init_method': 'k-means++', 'max_iter': 365, 'tol': 0.0009805989734124323}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:39,984] Trial 36 finished with value: 0.4321049001896423 and parameters: {'init_method': 'random', 'max_iter': 424, 'tol': 0.000419814468672918}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. 
You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:41,529] Trial 37 finished with value: 0.4305865668952844 and parameters: {'init_method': 'k-means++', 'max_iter': 295, 'tol': 0.00013213775642216308}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:43,049] Trial 38 finished with value: 0.43190782634674485 and parameters: {'init_method': 'random', 'max_iter': 453, 'tol': 0.0015965263757644617}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:44,554] Trial 39 finished with value: 0.435108693313064 and parameters: {'init_method': 'k-means++', 'max_iter': 287, 'tol': 0.0005034649719638121}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:46,076] Trial 40 finished with value: 0.4324434164041532 and parameters: {'init_method': 'k-means++', 'max_iter': 162, 'tol': 0.0030626163982188266}. Best is trial 5 with value: 0.4358925383652514. 
C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:47,669] Trial 41 finished with value: 0.4321049001896423 and parameters: {'init_method': 'k-means++', 'max_iter': 291, 'tol': 0.0007024378803794135}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:49,197] Trial 42 finished with value: 0.33965614966323704 and parameters: {'init_method': 'k-means++', 'max_iter': 239, 'tol': 0.00042945827232339123}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:50,791] Trial 43 finished with value: 0.4321049001896423 and parameters: {'init_method': 'k-means++', 'max_iter': 262, 'tol': 0.00018009610263633154}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:52,397] Trial 44 finished with value: 0.432588849686833 and parameters: {'init_method': 'k-means++', 'max_iter': 193, 'tol': 0.0018219691812370918}. 
Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:53,984] Trial 45 finished with value: 0.42839330719162905 and parameters: {'init_method': 'random', 'max_iter': 666, 'tol': 0.004202691790926479}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:55,558] Trial 46 finished with value: 0.4343877645144766 and parameters: {'init_method': 'k-means++', 'max_iter': 110, 'tol': 7.493855346883957e-05}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:42:57,108] Trial 47 finished with value: 0.4321049001896423 and parameters: {'init_method': 'k-means++', 'max_iter': 125, 'tol': 6.588205379349974e-05}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. 
[I 2023-05-21 15:42:58,659] Trial 48 finished with value: 0.34724728921821735 and parameters: {'init_method': 'k-means++', 'max_iter': 135, 'tol': 0.06327051373387946}. Best is trial 5 with value: 0.4358925383652514. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:00,255] Trial 49 finished with value: 0.4378635322074878 and parameters: {'init_method': 'random', 'max_iter': 575, 'tol': 0.027542688694427254}. Best is trial 49 with value: 0.4378635322074878. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:01,827] Trial 50 finished with value: 0.4303424364237128 and parameters: {'init_method': 'random', 'max_iter': 599, 'tol': 2.654665194091566e-05}. Best is trial 49 with value: 0.4378635322074878. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:03,380] Trial 51 finished with value: 0.3538201519793637 and parameters: {'init_method': 'random', 'max_iter': 562, 'tol': 0.025865301980658784}. Best is trial 49 with value: 0.4378635322074878. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. 
You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:04,842] Trial 52 finished with value: 0.3527640562830705 and parameters: {'init_method': 'random', 'max_iter': 527, 'tol': 0.05118273962637702}. Best is trial 49 with value: 0.4378635322074878. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:06,343] Trial 53 finished with value: 0.4280016934616923 and parameters: {'init_method': 'random', 'max_iter': 724, 'tol': 0.014623738995483686}. Best is trial 49 with value: 0.4378635322074878. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:07,826] Trial 54 finished with value: 0.4199369060159818 and parameters: {'init_method': 'random', 'max_iter': 222, 'tol': 0.07057973122744866}. Best is trial 49 with value: 0.4378635322074878. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:09,408] Trial 55 finished with value: 0.42726962053177936 and parameters: {'init_method': 'random', 'max_iter': 652, 'tol': 0.007215343319913173}. Best is trial 49 with value: 0.4378635322074878. 
C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:11,110] Trial 56 finished with value: 0.4258974956046676 and parameters: {'init_method': 'k-means++', 'max_iter': 467, 'tol': 0.025039453358270475}. Best is trial 49 with value: 0.4378635322074878. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:12,660] Trial 57 finished with value: 0.3446024738936803 and parameters: {'init_method': 'k-means++', 'max_iter': 167, 'tol': 0.09953211104117263}. Best is trial 49 with value: 0.4378635322074878. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:14,228] Trial 58 finished with value: 0.3393663797752192 and parameters: {'init_method': 'random', 'max_iter': 773, 'tol': 0.04458369527490244}. Best is trial 49 with value: 0.4378635322074878. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:15,844] Trial 59 finished with value: 0.42561277869228764 and parameters: {'init_method': 'k-means++', 'max_iter': 351, 'tol': 0.013134826711981889}. 
Best is trial 49 with value: 0.4378635322074878. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:17,448] Trial 60 finished with value: 0.3451360441111852 and parameters: {'init_method': 'k-means++', 'max_iter': 982, 'tol': 0.03278533023778418}. Best is trial 49 with value: 0.4378635322074878. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:19,078] Trial 61 finished with value: 0.4343877645144766 and parameters: {'init_method': 'k-means++', 'max_iter': 417, 'tol': 0.0003056778065589787}. Best is trial 49 with value: 0.4378635322074878. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:20,739] Trial 62 finished with value: 0.4321049001896423 and parameters: {'init_method': 'k-means++', 'max_iter': 420, 'tol': 0.0002722641766757945}. Best is trial 49 with value: 0.4378635322074878. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. 
[I 2023-05-21 15:43:22,391] Trial 63 finished with value: 0.43156678669853255 and parameters: {'init_method': 'k-means++', 'max_iter': 574, 'tol': 0.010354596491939826}. Best is trial 49 with value: 0.4378635322074878. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:24,048] Trial 64 finished with value: 0.43905868605599185 and parameters: {'init_method': 'k-means++', 'max_iter': 490, 'tol': 0.023214734737419047}. Best is trial 64 with value: 0.43905868605599185. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:25,622] Trial 65 finished with value: 0.4381843844942808 and parameters: {'init_method': 'k-means++', 'max_iter': 495, 'tol': 0.019983074237445563}. Best is trial 64 with value: 0.43905868605599185. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:27,268] Trial 66 finished with value: 0.4354246432737091 and parameters: {'init_method': 'k-means++', 'max_iter': 509, 'tol': 0.020837677499998854}. Best is trial 64 with value: 0.43905868605599185. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. 
You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:28,816] Trial 67 finished with value: 0.43701328658015187 and parameters: {'init_method': 'random', 'max_iter': 499, 'tol': 0.00665829390090303}. Best is trial 64 with value: 0.43905868605599185. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:30,408] Trial 68 finished with value: 0.4367182352972571 and parameters: {'init_method': 'random', 'max_iter': 499, 'tol': 0.020035659066041857}. Best is trial 64 with value: 0.43905868605599185. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:31,995] Trial 69 finished with value: 0.33202228250286997 and parameters: {'init_method': 'random', 'max_iter': 519, 'tol': 0.01900311292766893}. Best is trial 64 with value: 0.43905868605599185. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:33,542] Trial 70 finished with value: 0.42895630728790063 and parameters: {'init_method': 'random', 'max_iter': 474, 'tol': 0.013323379224548272}. Best is trial 64 with value: 0.43905868605599185. 
C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:35,128] Trial 71 finished with value: 0.34029635938646213 and parameters: {'init_method': 'random', 'max_iter': 490, 'tol': 0.023340288385034536}. Best is trial 64 with value: 0.43905868605599185. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:36,680] Trial 72 finished with value: 0.42820050868807746 and parameters: {'init_method': 'random', 'max_iter': 530, 'tol': 0.017901414918847846}. Best is trial 64 with value: 0.43905868605599185. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:38,215] Trial 73 finished with value: 0.43037317359807703 and parameters: {'init_method': 'random', 'max_iter': 445, 'tol': 0.008125183561931856}. Best is trial 64 with value: 0.43905868605599185. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:39,752] Trial 74 finished with value: 0.33706479510085036 and parameters: {'init_method': 'random', 'max_iter': 614, 'tol': 0.0332811564000976}. 
Best is trial 64 with value: 0.43905868605599185. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:41,319] Trial 75 finished with value: 0.4352115264726462 and parameters: {'init_method': 'random', 'max_iter': 586, 'tol': 0.01154866220536216}. Best is trial 64 with value: 0.43905868605599185. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:42,857] Trial 76 finished with value: 0.42606017370958005 and parameters: {'init_method': 'random', 'max_iter': 544, 'tol': 0.01189717603354288}. Best is trial 64 with value: 0.43905868605599185. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:44,395] Trial 77 finished with value: 0.42783312469028073 and parameters: {'init_method': 'random', 'max_iter': 594, 'tol': 0.016472094588508006}. Best is trial 64 with value: 0.43905868605599185. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. 
[I 2023-05-21 15:43:45,940] Trial 78 finished with value: 0.42920456085637965 and parameters: {'init_method': 'random', 'max_iter': 691, 'tol': 0.006630443547302084}. Best is trial 64 with value: 0.43905868605599185. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:47,505] Trial 79 finished with value: 0.4295124831689022 and parameters: {'init_method': 'random', 'max_iter': 637, 'tol': 0.008746998746930214}. Best is trial 64 with value: 0.43905868605599185. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:49,036] Trial 80 finished with value: 0.42797051211683806 and parameters: {'init_method': 'random', 'max_iter': 574, 'tol': 0.020826324406174268}. Best is trial 64 with value: 0.43905868605599185. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:50,579] Trial 81 finished with value: 0.4360994745913773 and parameters: {'init_method': 'random', 'max_iter': 384, 'tol': 0.028076000392057713}. Best is trial 64 with value: 0.43905868605599185. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. 
You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:52,157] Trial 82 finished with value: 0.4278209452429766 and parameters: {'init_method': 'random', 'max_iter': 496, 'tol': 0.027694887622102066}. Best is trial 64 with value: 0.43905868605599185. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:53,686] Trial 83 finished with value: 0.3398697647725439 and parameters: {'init_method': 'random', 'max_iter': 378, 'tol': 0.040720192072048966}. Best is trial 64 with value: 0.43905868605599185. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:55,249] Trial 84 finished with value: 0.42581608916115987 and parameters: {'init_method': 'random', 'max_iter': 442, 'tol': 0.010572789690874475}. Best is trial 64 with value: 0.43905868605599185. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:56,826] Trial 85 finished with value: 0.42568074601651285 and parameters: {'init_method': 'random', 'max_iter': 393, 'tol': 0.0165935625545255}. Best is trial 64 with value: 0.43905868605599185. 
C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:43:58,429] Trial 86 finished with value: 0.4249384848104348 and parameters: {'init_method': 'random', 'max_iter': 502, 'tol': 0.029482876574603877}. Best is trial 64 with value: 0.43905868605599185. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:44:00,023] Trial 87 finished with value: 0.4282824942953101 and parameters: {'init_method': 'random', 'max_iter': 533, 'tol': 0.02184455762442858}. Best is trial 64 with value: 0.43905868605599185. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:44:01,629] Trial 88 finished with value: 0.3369148811697588 and parameters: {'init_method': 'random', 'max_iter': 555, 'tol': 0.03930033782644164}. Best is trial 64 with value: 0.43905868605599185. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:44:03,215] Trial 89 finished with value: 0.4328823151780424 and parameters: {'init_method': 'random', 'max_iter': 476, 'tol': 0.014212419411728966}. 
Best is trial 64 with value: 0.43905868605599185. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:44:04,816] Trial 90 finished with value: 0.43960727119148074 and parameters: {'init_method': 'random', 'max_iter': 620, 'tol': 0.052127252658930465}. Best is trial 90 with value: 0.43960727119148074. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:44:06,433] Trial 91 finished with value: 0.3396776783683072 and parameters: {'init_method': 'random', 'max_iter': 578, 'tol': 0.05052187073961138}. Best is trial 90 with value: 0.43960727119148074. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:44:08,029] Trial 92 finished with value: 0.42886905275803794 and parameters: {'init_method': 'random', 'max_iter': 621, 'tol': 0.024470814541982515}. Best is trial 90 with value: 0.43960727119148074. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. 
[I 2023-05-21 15:44:09,633] Trial 93 finished with value: 0.33622142869096117 and parameters: {'init_method': 'random', 'max_iter': 677, 'tol': 0.0578566414422294}. Best is trial 90 with value: 0.43960727119148074. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:44:11,271] Trial 94 finished with value: 0.4296387283065301 and parameters: {'init_method': 'random', 'max_iter': 596, 'tol': 0.03235800868132299}. Best is trial 90 with value: 0.43960727119148074. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:44:12,880] Trial 95 finished with value: 0.4274982771033511 and parameters: {'init_method': 'random', 'max_iter': 514, 'tol': 0.011016338735183347}. Best is trial 90 with value: 0.43960727119148074. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:44:14,485] Trial 96 finished with value: 0.4293083111283291 and parameters: {'init_method': 'random', 'max_iter': 642, 'tol': 0.016642134489794652}. Best is trial 90 with value: 0.43960727119148074. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. 
You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:44:16,096] Trial 97 finished with value: 0.43284858879358806 and parameters: {'init_method': 'random', 'max_iter': 455, 'tol': 0.02021423617840185}. Best is trial 90 with value: 0.43960727119148074. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:44:17,699] Trial 98 finished with value: 0.42784221299678754 and parameters: {'init_method': 'random', 'max_iter': 548, 'tol': 0.03837882557656501}. Best is trial 90 with value: 0.43960727119148074. C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. [I 2023-05-21 15:44:19,269] Trial 99 finished with value: 0.3245876231853652 and parameters: {'init_method': 'random', 'max_iter': 436, 'tol': 0.0768559651045767}. Best is trial 90 with value: 0.43960727119148074.
Best hyperparameters: {'init_method': 'random', 'max_iter': 620, 'tol': 0.052127252658930465}
Best score: 0.43960727119148074
Optuna is a hyperparameter optimization library that utilizes a Bayesian optimization strategy to discover the optimal hyperparameters for a given machine learning model. The library is compatible with various machine learning frameworks, such as scikit-learn, PyTorch, and TensorFlow. Optuna offers several visualization tools to facilitate the analysis of hyperparameter optimization results.
Two commonly used visualization tools in Optuna are "plot parameter importance" and "plot optimization history". The "plot parameter importance" function creates a bar chart to display the relative importance of each hyperparameter in the optimization process. The importance scores are calculated based on the hyperparameters' contributions to the objective function during optimization. Higher importance scores indicate a greater impact on the model's performance.
The "plot optimization history" function, in turn, generates a line chart illustrating the objective function's values over time during the optimization process. The objective function value indicates the model's performance with the current hyperparameter settings. This chart aids in visualizing the optimization progress and identifying the optimal hyperparameter configuration.
# Visualize the finished Optuna study: optimization history first, then
# the standard diagnostic plots for the same study.
fig = plot_optimization_history(study)
history_layout = dict(
    title='Optimization History Plot',
    xaxis_title='Trials',
    yaxis_title='Objective Value',
    legend_title='Objective',
    font=dict(family="Courier New, monospace", size=18, color="#7f7f7f"),
)
fig.update_layout(**history_layout)
fig.show()
# Hyperparameter importance, empirical distribution, and contour views.
plot_param_importances(study)
plot_edf(study)
plot_contour(study)
# Fit the final K-means model with the best hyperparameters found by Optuna.
NUM_GROUPS = 3  # target clusters: slow, medium, fast
RANDOM_STATE = 0
best = study.best_params
kmeans = KMeans(
    NUM_GROUPS,
    random_state=RANDOM_STATE,
    init=best['init_method'],
    max_iter=best['max_iter'],
    tol=best['tol'],
    n_init=10,
)
cluster = kmeans.fit_predict(df_cluster)
unique, counts = np.unique(cluster, return_counts=True)
print(counts)
kmeans.inertia_
C:\Users\kumar\anaconda3\Desktop\jupyterSuman\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7.
[148 490 954]
14.948250401526149
# Name the clusters by ordering their centroids on the second feature
# (ascending) and assigning fast -> medium -> slow in that order, then
# write the label of each athlete's cluster back onto df.
cluster_centers = np.argsort(kmeans.cluster_centers_[:, 1])
_athlete_groups = ['fast', 'medium', 'slow']
cluster_labels = dict(zip(cluster_centers, _athlete_groups))
groups = [cluster_labels[x] for x in kmeans.labels_]
df.loc[df_cluster.index, 'group'] = groups
The resulting cluster plot shows a clear distinction between the three athlete categories, which is indicative of the effectiveness of our K-means clustering approach. Each group has a unique set of characteristics that distinguishes it from the others, and the decision boundary between the clusters is clearly defined. This shows that the clustering algorithm was able to successfully identify the different performance groups based on the athletes' performance metrics.
import plotly.express as px

# Scatter of total distance vs. finish time, coloured by cluster group
# (finishers only — DNF rows are excluded).
df_groups = df[df.group != 'DNF'].copy()
fig = px.scatter(
    df_groups,
    x='totaldistance',
    y='finish_time',
    color='group',
    symbol='group',
    opacity=0.7,
)
fig.update_layout(
    title=dict(text='Cluster Analysis', font=dict(size=20)),
    xaxis=dict(title=dict(text='Distance (km)', font=dict(size=16))),
    yaxis=dict(title=dict(text='Finish Time', font=dict(size=16))),
    legend=dict(
        title=dict(text='Groups', font=dict(size=16)),
        font=dict(size=14),
        bgcolor='white',
        bordercolor='gray',
        borderwidth=1,
    ),
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(size=14),
)
fig.show()
del df_groups
# Build the five heart-rate training zones from 50%..100% of max HR:
# zone i spans [limit_i, limit_{i+1}] * max_hr, giving intervals
# (95,114), (114,133), (133,152), (152,171), (171,190) for max_hr=190.
max_hr = 190
limits = np.arange(0.5, 1.01, 0.1)
boundaries = max_hr * limits
# Pair each boundary with the next one to form (min, max) rows.
zone_limits = np.column_stack((boundaries[:-1], boundaries[1:]))
zones_df = pd.DataFrame(zone_limits, columns=['min_h', 'max_h'])
zones_df['height'] = zones_df['max_h'] - zones_df['min_h']
zones_df['zone'] = [f'zone {i + 1}' for i in range(len(zones_df))]
def get_zone(heart_rates):
    """Return the training-zone label for every heart-rate sample.

    Each sample is looked up in the global ``zones_df`` table
    (columns ``min_h``/``max_h``/``zone``).  Out-of-range samples are
    clamped: below the lowest boundary -> first zone; above the highest
    boundary -> last zone.  (Previously a rate above max HR fell through
    to the "no match" branch and was wrongly labeled zone 1.)
    """
    # Hoist invariant lookups out of the per-sample loop.
    lowest_zone = zones_df['zone'].iloc[0]
    highest_zone = zones_df['zone'].iloc[-1]
    top_limit = zones_df['max_h'].iloc[-1]
    zones = []
    for hrate in heart_rates:
        _zone_df = zones_df[(zones_df.max_h >= hrate) & (zones_df.min_h <= hrate)]
        if not _zone_df.empty:
            zones.append(_zone_df['zone'].iloc[-1])
        elif hrate > top_limit:
            # Bug fix: rates above the top boundary belong in the highest zone.
            zones.append(highest_zone)
        else:
            zones.append(lowest_zone)
    return zones
# Label every 100 m heart-rate sample with its zone; progress_map shows a
# tqdm bar because the row-wise DataFrame lookups are slow (~5 min here).
df['zones'] = df['heartrate_100'].progress_map(get_zone)
100%|██████████████████████████████████████████████████████████████████████████████| 1678/1678 [05:01<00:00, 5.57it/s]
def get_zone_hrate(hrate):
    """Return the zone label whose [min_h, max_h] interval contains
    ``hrate``, or ``None`` if it falls outside every interval in the
    global ``zones_df`` table."""
    matching = zones_df[(zones_df.min_h <= hrate) & (zones_df.max_h >= hrate)]
    if matching.empty:
        return None
    return matching['zone'].iloc[-1]
# Assign each athlete a single zone from their mean heart rate over the run.
df['zone'] = df.avg_heart_rate.map(lambda x:get_zone_hrate(x))
# Notebook display of the per-athlete averages alongside the cluster group.
df[['avg_heart_rate','group']]
| avg_heart_rate | group | |
|---|---|---|
| 0 | 176.524332 | fast |
| 1 | 143.649048 | slow |
| 2 | 161.168603 | fast |
| 3 | 149.099813 | fast |
| 4 | 168.673731 | slow |
| ... | ... | ... |
| 1675 | 163.298888 | fast |
| 1676 | 147.175585 | fast |
| 1677 | 159.119735 | fast |
| 1678 | 148.863774 | medium |
| 1679 | 165.131478 | fast |
1678 rows × 2 columns
The graph below shows the cluster histogram for heart rate. We have divided the heart rate into bins for cluster analysis. It is evident from the visualization that the heart-rate distribution of the fast group is more skewed towards the right than those of the other groups, which shows that there is a clear distinction between the heart rate distributions of the different groups.
import matplotlib.pyplot as plt
from matplotlib import rcParams

# Publication-style serif (LaTeX-like) font for the figure.
rcParams['font.family'] = 'serif'
rcParams['font.serif'] = ['Times New Roman']

# Overlaid histogram of average heart rate, one distribution per cluster
# group (DNF athletes excluded).
df_cluster = df[df['group'] != 'DNF'].copy()
avg_heart_rate = df_cluster['avg_heart_rate']
group = df_cluster['group']
fig, ax = plt.subplots(figsize=(8, 6))  # adjust figure size as desired
bins = np.linspace(100, 180, 40)
unique_groups = group.unique()
# One colormap entry per cluster.
colors = plt.cm.get_cmap('tab10', len(unique_groups))
for idx, cluster_name in enumerate(unique_groups):
    cluster_rates = avg_heart_rate[group == cluster_name]
    ax.hist(cluster_rates, bins=bins, alpha=0.7,
            label=f"Cluster {cluster_name}", color=colors(idx))
ax.legend()
plt.xlabel('Average heart rate')
plt.ylabel('Count')
plt.grid(False)
plt.tight_layout()
plt.savefig('cluster_histogram.pdf', format='pdf')
plt.show()
The following plot visualizes the distribution of athletes across different heart rate zones and their corresponding distance covered in kilometers. For each 100 meters of distance, we compute the area of each zone. This helps us observe how athletes' heart rates change over the entire marathon duration. Analyzing the area of different zones can provide valuable insights into the impact of heart rate on performance and identify patterns among athletes of different groups.
Analyzing the area of different heart rate zones can also be useful in developing training and pacing strategies. Along with visualizing the results for all athletes, we also wanted to examine how different athlete groups behave throughout the marathon. We calculated zones for each 100 meters for every athlete and grouped them based on clustering, as explained in the later section. By dividing athletes into groups, we can study their heart rate patterns.
We observed that some athletes who didn't finish the race exhibited significant heart rate fluctuations without a consistent rhythm. Additionally, a higher proportion of these athletes had higher heart rates than athletes from the other groups from the start of the race. This insight highlights the impact of polarized training discussed in earlier chapters.
Furthermore, the Fast group started steadily but maintained a consistently high heart rate in Zone 5 during the later stages of the race. Conversely, the Slow group spent most of their time in Zone 3 and Zone 4, indicating that they did not push themselves hard enough throughout the marathon.
# One row per 100 m checkpoint, one column per athlete: the zone label at
# that point of the race.
df_exploded = df['zones'].apply(pd.Series).T
# Count athletes per zone at every checkpoint.  A single concat over the
# collected Series avoids the quadratic cost of re-concatenating the
# growing frame inside the loop.
per_checkpoint_counts = [
    df_exploded.iloc[i, :].value_counts() for i in range(len(df_exploded))
]
if per_checkpoint_counts:
    new_df = pd.concat(per_checkpoint_counts, axis=1).fillna(0).T
else:
    # Preserve the original behavior for an empty input frame.
    new_df = pd.DataFrame()
fig = go.Figure()
# Convert checkpoint index (100 m units) to kilometers.
new_df.index = new_df.index.map(lambda x: x*100/1000)
# Add one stacked-area trace per zone.
colors = ['rgba(255, 127, 14, 0.5)', 'rgba(44, 160, 44, 0.5)', 'rgba(214, 39, 40, 0.5)', 'rgba(31, 119, 180, 0.5)', 'rgba(148, 103, 189, 0.5)']
for i, col in enumerate(new_df.columns):
    fig.add_trace(go.Scatter(x=new_df.index, y=new_df[col], mode='lines', stackgroup='one', fillcolor=colors[i], name=col, line=dict(color=colors[i], width=2)))
# Layout: white background, titled axes and legend.
fig.update_layout(title=dict(text='Zone Counts', font=dict(size=20)),
                  xaxis=dict(title=dict(text='Distance (km)', font=dict(size=16))),
                  yaxis=dict(title=dict(text='Athletes', font=dict(size=16))),
                  legend=dict(title=dict(text='Zones', font=dict(size=16)), font=dict(size=14), bgcolor='white', bordercolor='gray', borderwidth=1),
                  plot_bgcolor='white',
                  paper_bgcolor='white',
                  font=dict(size=14))
fig.show()
def plot_group_zones(df, group='all'):
    """Build a stacked-area figure of athlete counts per heart-rate zone vs distance.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'zones' column holding one zone label per 100 m segment.
    group : str, optional
        Unused; kept only for backward compatibility with existing callers.
        Filter ``df`` to the desired group before calling instead.

    Returns
    -------
    plotly.graph_objects.Figure

    NOTE(review): relies on a module-level ``font`` dict being defined before
    the first call — confirm the defining cell runs first.
    """
    # One row per 100 m segment after transposing; value_counts() per row
    # gives the number of athletes in each zone at that distance.
    df_exploded = df['zones'].apply(pd.Series).T
    df_zones_distance = pd.DataFrame()
    for i in range(len(df_exploded)):
        row_counts = df_exploded.iloc[i, :].value_counts()
        df_zones_distance = pd.concat([df_zones_distance, row_counts], axis=1)
    df_zones_distance = df_zones_distance.fillna(0).T

    fig = go.Figure()
    # Each index step is one 100 m segment -> convert to kilometers
    df_zones_distance.index = df_zones_distance.index.map(lambda x: x * 100 / 1000)
    # One semi-transparent color per zone, stacked into an area chart
    # (the original re-assigned this identical list a second time; removed as dead code)
    colors = ['rgba(255, 127, 14, 0.5)', 'rgba(44, 160, 44, 0.5)', 'rgba(214, 39, 40, 0.5)', 'rgba(31, 119, 180, 0.5)', 'rgba(148, 103, 189, 0.5)']
    for i, col in enumerate(df_zones_distance.columns):
        fig.add_trace(go.Scatter(x=df_zones_distance.index,
                                 y=df_zones_distance[col], mode='lines',
                                 stackgroup='one', fillcolor=colors[i],
                                 name=col, line=dict(color=colors[i], width=2)))
    # Layout: white background, mirrored black axis frames, framed legend
    fig.update_layout(xaxis=dict(title=dict(text='Distance (km)', font=dict(size=16)),
                                 showline=True, linewidth=1, linecolor='black', mirror=True),
                      yaxis=dict(title=dict(text='Athletes', font=dict(size=16)),
                                 showline=True, linewidth=1, linecolor='black', mirror=True),
                      legend=dict(title=dict(text='Zones', font=dict(size=16)), font=dict(size=14), bgcolor='white', bordercolor='gray', borderwidth=1),
                      plot_bgcolor='white',
                      paper_bgcolor='white',
                      margin=dict(t=0),
                      font=font)
    return fig
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Shared font style used across this figure and later cells
font = dict(family='serif', size=12)

# One panel per athlete group, laid out on a 2x2 grid
fig = make_subplots(rows=2, cols=2,
                    subplot_titles=("Slow Group", "Medium Group", "Fast Group", "DNF Group"),
                    vertical_spacing=0.15)

groups = ['slow', 'medium', 'fast', 'DNF']
for idx, grp in enumerate(groups):
    df_group = df[df.group == grp]
    # Map the flat index onto (row, col) in the 2x2 grid
    r, c = divmod(idx, 2)
    r, c = r + 1, c + 1
    for trace in plot_group_zones(df_group).data:
        # Only the first panel contributes legend entries; the other three
        # would just repeat the same zone labels.
        if idx != 0:
            trace.showlegend = False
        fig.add_trace(trace, row=r, col=c)
    fig.update_xaxes(title_text='Distance (km)', row=r, col=c, title_font=font)
    fig.update_yaxes(title_text='Athletes', row=r, col=c, title_font=font)

fig.update_layout(width=800, height=600,
                  font=font,
                  legend=dict(font=dict(size=16, family='serif'),
                              bgcolor='white', bordercolor='gray', borderwidth=1),
                  showlegend=True,
                  xaxis_showgrid=False,
                  yaxis_showgrid=False,
                  template='plotly_white',
                  )
# Reverse the legend order and trim the top margin
fig.update_layout(dict(legend=dict(traceorder='reversed')), margin=dict(t=0))
fig.show()
# Build a supervised dataset: per-100 m heart-rate series -> finish time.
col = 'heartrate_100'
out_col = 'finish_time'
df_time = df[[col]]
# Expand the list-valued heart-rate column into one column per 100 m segment
df_time, finish_time = df_time[col].apply(pd.Series), df[out_col]
df_time[out_col] = finish_time
# Fill gaps in the heart-rate series with cubic interpolation
df_time = df_time.interpolate(method='cubic')
X = df_time.iloc[:, :-1]
y = df_time.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)
# Standardize features and target with SEPARATE scalers. The original reused
# one scaler for both, which produced correct numbers only because X was
# transformed before the re-fit on y — fragile against reordering.
x_scaler = StandardScaler()
X_train = x_scaler.fit_transform(X_train)
X_test = x_scaler.transform(X_test)
# `scaler` stays bound to the fitted target scaler, matching the original
# end state in case later cells use it to inverse-transform predictions.
scaler = StandardScaler()
y_train = scaler.fit_transform(y_train.reshape(-1, 1)).flatten()
y_test = scaler.transform(y_test.reshape(-1, 1)).flatten()
# Keep only the first half of the race's segments as features
rows = X_train.shape[1] // 2
X_train = X_train[:, :rows]
X_test = X_test[:, :rows]
# Interpolation can leave NaNs at series edges; zero them out
# (zero equals the column mean in standardized space)
X_train[np.isnan(X_train)] = 0
X_test[np.isnan(X_test)] = 0
The provided code snippet defines a function svm_objective that is used for optimizing the hyperparameters of a Support Vector Machine (SVM) model using Optuna.
The function takes a trial object as input, which is provided by Optuna and is used to suggest different values for the hyperparameters to explore during the optimization process. The hyperparameters being optimized are:
C: the regularization parameter that controls the trade-off between maximizing the margin and minimizing the error. epsilon: the parameter that determines the margin around the regression line within which no penalty is associated with errors. kernel: the kernel function to be used in the SVM model; the available options are 'linear', 'poly', 'rbf', and 'sigmoid'. Inside the function, a Support Vector Regression (SVR) model is initialized with the suggested hyperparameters. The training data is then split into training and validation folds using k-fold cross-validation. The SVR model is trained on each training fold, and the mean absolute error (MAE) is calculated by comparing the predicted values on the corresponding validation fold with the actual values.
The MAE values for each fold are stored in a list, and the function returns the average MAE across all folds as the objective value for the optimization process.
def svm_objective(trial):
    """Optuna objective: mean 5-fold CV MAE of an SVR on the training set.

    The original search sampled both C and epsilon uniformly in [10, 100].
    With a standardized target (std = 1), an epsilon tube that wide contains
    every point, so every trial returned the identical objective value (as
    the recorded study log shows). Both are now sampled on a log scale over
    ranges that actually differentiate models.
    """
    c = trial.suggest_float('C', 1e-2, 1e2, log=True)
    epsilon = trial.suggest_float('epsilon', 1e-3, 1e0, log=True)
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
    svr = SVR(C=c, epsilon=epsilon, kernel=kernel)
    # Fixed-seed shuffled folds so trials are compared on identical splits
    kf = KFold(n_splits=5, shuffle=True, random_state=0)
    mae_list = []
    for train_index, val_index in kf.split(X_train):
        X_train_kf, X_val = X_train[train_index], X_train[val_index]
        y_train_kf, y_val = y_train[train_index], y_train[val_index]
        svr.fit(X_train_kf, y_train_kf)
        y_pred = svr.predict(X_val)
        mae_list.append(mean_absolute_error(y_val, y_pred))
    # Average validation MAE across folds is the value Optuna minimizes
    return np.mean(mae_list)
# Run Optuna to find the best SVM hyperparameters
svm_study = optuna.create_study(direction='minimize')
svm_study.optimize(svm_objective, n_trials=100)
# Refit on the full training set with ALL tuned hyperparameters.
# (The original passed only C and epsilon, silently discarding the tuned
# kernel and falling back to SVR's default 'rbf'.)
svm_model = SVR(**svm_study.best_params)
svm_model.fit(X_train, y_train)
# Predict on the held-out test set
svm_y_pred = svm_model.predict(X_test)
# Evaluate: RMSE and MAE in standardized-target units
svm_rmse = mean_squared_error(y_test, svm_y_pred, squared=False)
svm_mae = mean_absolute_error(y_test, svm_y_pred)
results = {}
name = 'svm'
results[name] = {"rmse": svm_rmse, "mae": svm_mae}
[I 2023-05-21 15:49:34,734] A new study created in memory with name: no-name-6723ae87-38de-4dad-8886-20354ae90942 [I 2023-05-21 15:49:34,768] Trial 0 finished with value: 0.9480527597599474 and parameters: {'C': 48.98766668884487, 'epsilon': 32.94013059240258, 'kernel': 'rbf'}. Best is trial 0 with value: 0.9480527597599474. [I 2023-05-21 15:49:34,794] Trial 1 finished with value: 0.9480527597599464 and parameters: {'C': 55.879299743033506, 'epsilon': 91.38157623845427, 'kernel': 'rbf'}. Best is trial 1 with value: 0.9480527597599464. [I 2023-05-21 15:49:34,817] Trial 2 finished with value: 0.9480527597599474 and parameters: {'C': 35.4846605014333, 'epsilon': 28.827858470011854, 'kernel': 'rbf'}. Best is trial 1 with value: 0.9480527597599464. [I 2023-05-21 15:49:34,838] Trial 3 finished with value: 0.9480527597599477 and parameters: {'C': 21.916366635730093, 'epsilon': 64.47969734523696, 'kernel': 'poly'}. Best is trial 1 with value: 0.9480527597599464. [I 2023-05-21 15:49:34,861] Trial 4 finished with value: 0.9480527597599462 and parameters: {'C': 46.68900734868066, 'epsilon': 76.91881883962348, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:34,881] Trial 5 finished with value: 0.9480527597599464 and parameters: {'C': 77.37688686246307, 'epsilon': 83.03765100556345, 'kernel': 'linear'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:34,907] Trial 6 finished with value: 0.9480527597599474 and parameters: {'C': 77.54150859267655, 'epsilon': 24.456429256724775, 'kernel': 'linear'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:34,931] Trial 7 finished with value: 0.9480527597599462 and parameters: {'C': 33.51482096105953, 'epsilon': 95.377334706877, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:34,954] Trial 8 finished with value: 0.9480527597599474 and parameters: {'C': 43.4062493256822, 'epsilon': 27.126715229981667, 'kernel': 'sigmoid'}. 
Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:34,976] Trial 9 finished with value: 0.9480527597599474 and parameters: {'C': 84.66958057603607, 'epsilon': 24.07150921456597, 'kernel': 'rbf'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:35,006] Trial 10 finished with value: 0.9480527597599477 and parameters: {'C': 10.782991524289748, 'epsilon': 67.72774446639866, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:35,041] Trial 11 finished with value: 0.9480527597599464 and parameters: {'C': 60.126200963629095, 'epsilon': 98.50025800637485, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:35,100] Trial 12 finished with value: 0.9480527597599464 and parameters: {'C': 36.15658628782677, 'epsilon': 82.39198695358724, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:35,162] Trial 13 finished with value: 0.9480527597599462 and parameters: {'C': 28.269577363930544, 'epsilon': 76.18762011108011, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:35,206] Trial 14 finished with value: 0.9480527597599464 and parameters: {'C': 60.81925428223067, 'epsilon': 97.38060789930296, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:35,251] Trial 15 finished with value: 0.9480527597599477 and parameters: {'C': 42.612407329638415, 'epsilon': 52.412044631040104, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:35,299] Trial 16 finished with value: 0.9480527597599462 and parameters: {'C': 29.401041623767163, 'epsilon': 89.60153366977543, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:35,351] Trial 17 finished with value: 0.9480527597599462 and parameters: {'C': 49.41852632252185, 'epsilon': 74.36939079432177, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. 
[I 2023-05-21 15:49:35,401] Trial 18 finished with value: 0.9480527597599464 and parameters: {'C': 98.06264223591958, 'epsilon': 99.7480423211797, 'kernel': 'linear'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:35,451] Trial 19 finished with value: 0.9480527597599476 and parameters: {'C': 18.889708792262603, 'epsilon': 12.368417639510326, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:35,496] Trial 20 finished with value: 0.9480527597599464 and parameters: {'C': 37.03222010698268, 'epsilon': 87.92833727135323, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:35,541] Trial 21 finished with value: 0.9480527597599462 and parameters: {'C': 27.77214037974312, 'epsilon': 76.20573646034144, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:35,577] Trial 22 finished with value: 0.9480527597599464 and parameters: {'C': 27.188813558026474, 'epsilon': 78.98898822029699, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:35,606] Trial 23 finished with value: 0.9480527597599477 and parameters: {'C': 10.609880189380672, 'epsilon': 68.5010724672316, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:35,641] Trial 24 finished with value: 0.9480527597599464 and parameters: {'C': 20.093770250589472, 'epsilon': 88.41000891370933, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:35,691] Trial 25 finished with value: 0.9480527597599477 and parameters: {'C': 31.961940009187124, 'epsilon': 61.54268284560145, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:35,721] Trial 26 finished with value: 0.9480527597599462 and parameters: {'C': 40.352322518215914, 'epsilon': 73.53035747035797, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. 
[I 2023-05-21 15:49:35,751] Trial 27 finished with value: 0.9480527597599464 and parameters: {'C': 45.29294742315939, 'epsilon': 82.45596687513364, 'kernel': 'linear'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:35,819] Trial 28 finished with value: 0.9480527597599477 and parameters: {'C': 33.68687557363233, 'epsilon': 59.21743702037239, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:35,875] Trial 29 finished with value: 0.9480527597599477 and parameters: {'C': 50.13113679311652, 'epsilon': 46.98242765151489, 'kernel': 'rbf'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:35,906] Trial 30 finished with value: 0.9480527597599462 and parameters: {'C': 25.33567091530712, 'epsilon': 70.87565533787297, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:35,931] Trial 31 finished with value: 0.9480527597599462 and parameters: {'C': 30.11030226709531, 'epsilon': 90.78080349608088, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:35,966] Trial 32 finished with value: 0.9480527597599462 and parameters: {'C': 40.6068308866196, 'epsilon': 92.57177063865537, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:35,996] Trial 33 finished with value: 0.9480527597599462 and parameters: {'C': 32.51681080407933, 'epsilon': 85.75462789752295, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,034] Trial 34 finished with value: 0.9480527597599464 and parameters: {'C': 23.0053368766571, 'epsilon': 94.26252009946074, 'kernel': 'rbf'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,063] Trial 35 finished with value: 0.9480527597599462 and parameters: {'C': 38.1082640822263, 'epsilon': 79.24216355326625, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. 
[I 2023-05-21 15:49:36,095] Trial 36 finished with value: 0.9480527597599464 and parameters: {'C': 17.91060970759741, 'epsilon': 85.69391822094215, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,121] Trial 37 finished with value: 0.9480527597599464 and parameters: {'C': 29.842607054607576, 'epsilon': 93.12844290797105, 'kernel': 'linear'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,151] Trial 38 finished with value: 0.9480527597599464 and parameters: {'C': 53.310488466084294, 'epsilon': 77.81438376996516, 'kernel': 'rbf'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,186] Trial 39 finished with value: 0.9480527597599462 and parameters: {'C': 45.557508281649675, 'epsilon': 89.3489477568804, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,216] Trial 40 finished with value: 0.9480527597599462 and parameters: {'C': 35.500611753298365, 'epsilon': 95.05183690589537, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,248] Trial 41 finished with value: 0.9480527597599464 and parameters: {'C': 49.89822805131738, 'epsilon': 71.76559757155286, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,279] Trial 42 finished with value: 0.9480527597599462 and parameters: {'C': 45.17667310058303, 'epsilon': 82.53455983593888, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,301] Trial 43 finished with value: 0.9480527597599464 and parameters: {'C': 56.21179869513486, 'epsilon': 75.60800177101808, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,339] Trial 44 finished with value: 0.9480527597599464 and parameters: {'C': 25.110902322507208, 'epsilon': 80.19259572206926, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. 
[I 2023-05-21 15:49:36,367] Trial 45 finished with value: 0.9480527597599477 and parameters: {'C': 39.325917826013985, 'epsilon': 66.33480204310779, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,396] Trial 46 finished with value: 0.9480527597599464 and parameters: {'C': 34.31380996349332, 'epsilon': 85.52557215795872, 'kernel': 'linear'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,426] Trial 47 finished with value: 0.9480527597599464 and parameters: {'C': 47.90760696668642, 'epsilon': 97.94333302487343, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,461] Trial 48 finished with value: 0.9480527597599464 and parameters: {'C': 39.93305254741219, 'epsilon': 90.85915891939763, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,491] Trial 49 finished with value: 0.9480527597599464 and parameters: {'C': 16.232427015799587, 'epsilon': 73.81813456641959, 'kernel': 'rbf'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,521] Trial 50 finished with value: 0.9480527597599464 and parameters: {'C': 28.275861862882586, 'epsilon': 76.51386781916577, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,551] Trial 51 finished with value: 0.9480527597599462 and parameters: {'C': 28.311914109794593, 'epsilon': 70.06111928170664, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,586] Trial 52 finished with value: 0.9480527597599464 and parameters: {'C': 22.90329045461241, 'epsilon': 76.85588782760979, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,616] Trial 53 finished with value: 0.9480527597599462 and parameters: {'C': 36.89217025133842, 'epsilon': 81.85591671391693, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. 
[I 2023-05-21 15:49:36,652] Trial 54 finished with value: 0.9480527597599464 and parameters: {'C': 31.675633212552004, 'epsilon': 99.98167611558705, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,681] Trial 55 finished with value: 0.9480527597599462 and parameters: {'C': 24.9188905082314, 'epsilon': 86.12622217793275, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,712] Trial 56 finished with value: 0.9480527597599477 and parameters: {'C': 35.01133999538285, 'epsilon': 65.16341581509423, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,751] Trial 57 finished with value: 0.9480527597599477 and parameters: {'C': 42.800131285473675, 'epsilon': 68.39250177522874, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,826] Trial 58 finished with value: 0.9480527597599462 and parameters: {'C': 13.940445985995197, 'epsilon': 73.64607893455799, 'kernel': 'linear'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,883] Trial 59 finished with value: 0.9480527597599464 and parameters: {'C': 20.147641940487347, 'epsilon': 95.03159966104889, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,922] Trial 60 finished with value: 0.9480527597599462 and parameters: {'C': 26.426275332270674, 'epsilon': 79.7652765816588, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,951] Trial 61 finished with value: 0.9480527597599462 and parameters: {'C': 41.94479760518233, 'epsilon': 72.76449733791578, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:36,991] Trial 62 finished with value: 0.9480527597599462 and parameters: {'C': 30.71946692441271, 'epsilon': 74.85191580346252, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. 
[I 2023-05-21 15:49:37,026] Trial 63 finished with value: 0.9480527597599462 and parameters: {'C': 37.9710464621196, 'epsilon': 83.90131532707359, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,051] Trial 64 finished with value: 0.9480527597599462 and parameters: {'C': 32.97821864637885, 'epsilon': 89.57106248088324, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,086] Trial 65 finished with value: 0.9480527597599477 and parameters: {'C': 40.96271014714393, 'epsilon': 62.521963232830274, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,116] Trial 66 finished with value: 0.9480527597599462 and parameters: {'C': 29.731146714879756, 'epsilon': 70.82112188762086, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,141] Trial 67 finished with value: 0.9480527597599462 and parameters: {'C': 35.557052105912334, 'epsilon': 87.6843199469087, 'kernel': 'rbf'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,185] Trial 68 finished with value: 0.9480527597599462 and parameters: {'C': 22.589575561907054, 'epsilon': 80.32653888540362, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,225] Trial 69 finished with value: 0.9480527597599464 and parameters: {'C': 48.003093108896884, 'epsilon': 77.50651328053095, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,258] Trial 70 finished with value: 0.9480527597599477 and parameters: {'C': 43.10444005161865, 'epsilon': 67.8610001437054, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,292] Trial 71 finished with value: 0.9480527597599464 and parameters: {'C': 26.399054684328643, 'epsilon': 70.61821671041685, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. 
[I 2023-05-21 15:49:37,324] Trial 72 finished with value: 0.9480527597599462 and parameters: {'C': 32.35584607146903, 'epsilon': 75.41073538004865, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,351] Trial 73 finished with value: 0.9480527597599464 and parameters: {'C': 28.798980130848797, 'epsilon': 83.51689681323282, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,381] Trial 74 finished with value: 0.9480527597599464 and parameters: {'C': 24.36958908797376, 'epsilon': 78.26582882235172, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,416] Trial 75 finished with value: 0.9480527597599462 and parameters: {'C': 39.30780103163566, 'epsilon': 72.64629361349232, 'kernel': 'linear'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,446] Trial 76 finished with value: 0.9480527597599462 and parameters: {'C': 27.30431859373777, 'epsilon': 92.23698288140382, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,481] Trial 77 finished with value: 0.9480527597599464 and parameters: {'C': 20.575859210185605, 'epsilon': 81.17676819607229, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,511] Trial 78 finished with value: 0.9480527597599477 and parameters: {'C': 37.47266888937557, 'epsilon': 63.74841074382261, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,541] Trial 79 finished with value: 0.9480527597599477 and parameters: {'C': 32.67464161046893, 'epsilon': 67.07042745823661, 'kernel': 'rbf'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,576] Trial 80 finished with value: 0.9480527597599464 and parameters: {'C': 44.38773483907933, 'epsilon': 96.46674404270544, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. 
[I 2023-05-21 15:49:37,601] Trial 81 finished with value: 0.9480527597599462 and parameters: {'C': 34.673320012330265, 'epsilon': 91.49646828625602, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,636] Trial 82 finished with value: 0.9480527597599464 and parameters: {'C': 30.44244534541309, 'epsilon': 87.13303718639978, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,671] Trial 83 finished with value: 0.9480527597599462 and parameters: {'C': 24.221668955377655, 'epsilon': 89.85130228707938, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,701] Trial 84 finished with value: 0.9480527597599462 and parameters: {'C': 26.98916376239042, 'epsilon': 83.44881665284832, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,731] Trial 85 finished with value: 0.9480527597599462 and parameters: {'C': 21.896749240032364, 'epsilon': 75.91216587315633, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,767] Trial 86 finished with value: 0.9480527597599462 and parameters: {'C': 36.7068170187924, 'epsilon': 97.00744833847595, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,796] Trial 87 finished with value: 0.9480527597599462 and parameters: {'C': 31.21511023180425, 'epsilon': 93.3370850692995, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,821] Trial 88 finished with value: 0.9480527597599464 and parameters: {'C': 51.768999061786346, 'epsilon': 78.65002053561977, 'kernel': 'linear'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,861] Trial 89 finished with value: 0.9480527597599462 and parameters: {'C': 46.80751087465776, 'epsilon': 84.63314601692254, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. 
[I 2023-05-21 15:49:37,892] Trial 90 finished with value: 0.9480527597599462 and parameters: {'C': 29.33712255722316, 'epsilon': 74.41884681366993, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,921] Trial 91 finished with value: 0.9480527597599464 and parameters: {'C': 39.305511456185044, 'epsilon': 88.7362416107984, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,956] Trial 92 finished with value: 0.9480527597599464 and parameters: {'C': 34.22311605804316, 'epsilon': 91.79684846451539, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:37,981] Trial 93 finished with value: 0.9480527597599462 and parameters: {'C': 40.99422637122547, 'epsilon': 94.21331198134338, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:38,022] Trial 94 finished with value: 0.9480527597599462 and parameters: {'C': 45.11042668811193, 'epsilon': 87.46767256953679, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:38,051] Trial 95 finished with value: 0.9480527597599462 and parameters: {'C': 42.7059067049944, 'epsilon': 69.32755601081989, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:38,081] Trial 96 finished with value: 0.9480527597599464 and parameters: {'C': 33.44993815656017, 'epsilon': 72.17226369208012, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:38,116] Trial 97 finished with value: 0.9480527597599464 and parameters: {'C': 49.8225178801405, 'epsilon': 81.02099338837618, 'kernel': 'rbf'}. Best is trial 4 with value: 0.9480527597599462. [I 2023-05-21 15:49:38,141] Trial 98 finished with value: 0.9480527597599464 and parameters: {'C': 36.078448256681654, 'epsilon': 85.63576736743485, 'kernel': 'sigmoid'}. Best is trial 4 with value: 0.9480527597599462. 
[I 2023-05-21 15:49:38,176] Trial 99 finished with value: 0.9480527597599464 and parameters: {'C': 46.77228807526704, 'epsilon': 98.73164543050024, 'kernel': 'poly'}. Best is trial 4 with value: 0.9480527597599462.
# Visualize how the SVM study's objective value evolved across trials
fig = plot_optimization_history(svm_study)
fig.update_layout(
    title='Optimization History Plot',
    xaxis_title='Trials',
    yaxis_title='Objective Value',
    legend_title='Objective',
    font=dict(family="Courier New, monospace", size=18, color="#7f7f7f"),
)
fig.show()
The provided code below demonstrates the optimization and evaluation of a Gradient Boosting model using Optuna.
The function gb_objective is defined to optimize the hyperparameters of a Gradient Boosting Regressor using Optuna. The following hyperparameters are considered:
n_estimators: the number of boosting stages (trees) in the gradient boosting model. max_depth: the maximum depth of each individual tree. learning_rate: the shrinkage factor that controls the contribution of each tree. subsample: the fraction of samples used for training each individual tree. min_samples_split: the minimum number of samples required to split an internal node. min_samples_leaf: the minimum number of samples required at a leaf node. max_features: the maximum number of features considered when looking for the best split. random_state: the random seed for reproducibility. Inside the function, a Gradient Boosting Regressor is initialized with the suggested hyperparameters. Cross-validation with 5 folds yields the negative mean squared error for each fold; taking the square root of the negated scores gives the per-fold RMSE, and the objective function returns the average RMSE across all folds.
A study object gb_study is created using Optuna, specifying the direction of optimization as "minimize". The gb_objective function is then optimized using the study object (note: the accompanying code runs only a single trial for demonstration; increase n_trials for a real search).
The best RMSE value and the corresponding best hyperparameters are printed. The Gradient Boosting Regressor model is then instantiated with the best hyperparameters.
The model is fitted on the training data (X_train and y_train), and predictions are made on the test data (X_test). The root mean squared error (RMSE) and mean absolute error (MAE) of the model's predictions are calculated using the ground truth (y_test).
Finally, the calculated RMSE and MAE values are stored in the results dictionary under the key "gradient boosting" for further analysis or comparison.
def gb_objective(trial):
    """Optuna objective: average 5-fold CV RMSE of a GradientBoostingRegressor."""
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1),
        "subsample": trial.suggest_float("subsample", 0.1, 1.0),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 5),
        "max_features": trial.suggest_int("max_features", 1, 4),
        # NOTE(review): RANDOM_STATE is assumed to be defined in an earlier
        # notebook cell — confirm; a NameError here means it was never set.
        "random_state": RANDOM_STATE,
    }
    model = GradientBoostingRegressor(**params)
    # cross_val_score returns negated MSEs; negate and root them fold-wise
    fold_rmse = np.sqrt(-cross_val_score(model, X_train, y_train, cv=5,
                                         scoring="neg_mean_squared_error"))
    return fold_rmse.mean()
gb_study = optuna.create_study(direction="minimize")
# NOTE(review): only ONE trial is run (the text above says 100) — kept at 1
# to match the recorded output; raise n_trials for a real search.
gb_study.optimize(gb_objective, n_trials=1)
print(f"Best RMSE: {gb_study.best_value:.5f}")
print(f"Best params: {gb_study.best_params}")
# Refit a gradient-boosting model with the best hyperparameters
# (the original comments said "SVM model" here — copy-paste leftovers)
gb = GradientBoostingRegressor(**gb_study.best_params)
gb.fit(X_train, y_train)
# Predict on the held-out test set
y_pred = gb.predict(X_test)
# Evaluate: RMSE and MAE in standardized-target units
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
name = 'gradient boosting'
results[name] = {"rmse": rmse, "mae": mae}
results
[I 2023-05-21 15:49:38,254] A new study created in memory with name: no-name-8d4b709f-fdb3-47c1-8f08-c32b5b6b237d [I 2023-05-21 15:49:39,325] Trial 0 finished with value: 0.9702598352528893 and parameters: {'n_estimators': 283, 'max_depth': 4, 'learning_rate': 0.04002850032737316, 'subsample': 0.8765247137862936, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 1}. Best is trial 0 with value: 0.9702598352528893.
Best RMSE: 0.97026
Best params: {'n_estimators': 283, 'max_depth': 4, 'learning_rate': 0.04002850032737316, 'subsample': 0.8765247137862936, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 1}
{'svm': {'rmse': 1.2346956112872556, 'mae': 1.0287711743150938},
'gradient boosting': {'rmse': 0.94647337792175, 'mae': 0.7100730651288648}}
Below is a comparison of RMSE and MAE for all algorithms. We can see that gradient boosting outperforms the other models because of its ability to handle non-linear relationships. Ensembles also give at least a 2% boost in performance.
# Compare RMSE and MAE across all evaluated models with a grouped bar chart
names = list(results.keys())
rmse = [results[name]['rmse'] for name in names]
mae = [results[name]['mae'] for name in names]
fig = go.Figure()
fig.add_trace(go.Bar(name='RMSE', x=names, y=rmse))
fig.add_trace(go.Bar(name='MAE', x=names, y=mae))
# Shared serif font; label the axes and render
fig.update_layout(font=font,
                  xaxis_title='Model', yaxis_title='Score')
fig.show()